Analyse Exploratoire

# Mean of each numeric column.
# drop = FALSE keeps a data frame even when num_cols names a single column, so
# the statistic is always computed per column and never per element.
vapply(data[, num_cols, drop = FALSE], mean, numeric(1), na.rm = TRUE)
##                          BALANCE                BALANCE_FREQUENCY 
##                     1.024948e+03                     9.694885e-01 
##                        PURCHASES                 ONEOFF_PURCHASES 
##                     2.615234e+02                     1.175435e+02 
##           INSTALLMENTS_PURCHASES                     CASH_ADVANCE 
##                     1.441638e+02                     3.343629e+02 
##              PURCHASES_FREQUENCY       ONEOFF_PURCHASES_FREQUENCY 
##                     3.444717e-01                     7.219488e-02 
## PURCHASES_INSTALLMENTS_FREQUENCY           CASH_ADVANCE_FREQUENCY 
##                     2.779689e-01                     9.103191e-02 
##                 CASH_ADVANCE_TRX                    PURCHASES_TRX 
##                     1.491892e+00                     5.810811e+00 
##                     CREDIT_LIMIT                         PAYMENTS 
##                     2.579091e+03                     5.838450e+02 
##                 MINIMUM_PAYMENTS                 PRC_FULL_PAYMENT 
##                     3.936023e+02                     1.985161e-02 
##                           TENURE 
##                     1.200000e+01
# Median of each numeric column.
# drop = FALSE keeps a data frame even when num_cols names a single column, so
# the statistic is always computed per column and never per element.
vapply(data[, num_cols, drop = FALSE], median, numeric(1), na.rm = TRUE)
##                          BALANCE                BALANCE_FREQUENCY 
##                       955.184754                         1.000000 
##                        PURCHASES                 ONEOFF_PURCHASES 
##                       149.710000                         0.000000 
##           INSTALLMENTS_PURCHASES                     CASH_ADVANCE 
##                         0.000000                        37.483718 
##              PURCHASES_FREQUENCY       ONEOFF_PURCHASES_FREQUENCY 
##                         0.250000                         0.000000 
## PURCHASES_INSTALLMENTS_FREQUENCY           CASH_ADVANCE_FREQUENCY 
##                         0.000000                         0.083333 
##                 CASH_ADVANCE_TRX                    PURCHASES_TRX 
##                         1.000000                         3.000000 
##                     CREDIT_LIMIT                         PAYMENTS 
##                      2000.000000                       504.561160 
##                 MINIMUM_PAYMENTS                 PRC_FULL_PAYMENT 
##                       311.963409                         0.000000 
##                           TENURE 
##                        12.000000
# Mode (custom function: base R's mode() reports the storage mode of an object,
# not the most frequent value).
#
# v      Vector of values.
# na.rm  If TRUE, missing values are removed first so NA can never be reported
#        as the mode. Defaults to FALSE, in which case NA is counted like any
#        other value.
#
# Returns the most frequent value of v; ties go to the value that appears
# first. An empty input yields NA.
get_mode <- function(v, na.rm = FALSE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  uniqv <- unique(v)
  # match() maps every element to the position of its distinct value and
  # tabulate() counts the occurrences of each position.
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Mode of each numeric column; drop = FALSE keeps a data frame even when
# num_cols names a single column, so get_mode() always receives whole columns.
vapply(data[, num_cols, drop = FALSE], get_mode, numeric(1))
##                          BALANCE                BALANCE_FREQUENCY 
##                         40.90075                          1.00000 
##                        PURCHASES                 ONEOFF_PURCHASES 
##                          0.00000                          0.00000 
##           INSTALLMENTS_PURCHASES                     CASH_ADVANCE 
##                          0.00000                          0.00000 
##              PURCHASES_FREQUENCY       ONEOFF_PURCHASES_FREQUENCY 
##                          0.00000                          0.00000 
## PURCHASES_INSTALLMENTS_FREQUENCY           CASH_ADVANCE_FREQUENCY 
##                          0.00000                          0.00000 
##                 CASH_ADVANCE_TRX                    PURCHASES_TRX 
##                          0.00000                          0.00000 
##                     CREDIT_LIMIT                         PAYMENTS 
##                       1200.00000                        201.80208 
##                 MINIMUM_PAYMENTS                 PRC_FULL_PAYMENT 
##                        139.50979                          0.00000 
##                           TENURE 
##                         12.00000
# Standard deviation of each numeric column.
# drop = FALSE keeps a data frame even when num_cols names a single column, so
# the statistic is always computed per column and never per element.
vapply(data[, num_cols, drop = FALSE], sd, numeric(1), na.rm = TRUE)
##                          BALANCE                BALANCE_FREQUENCY 
##                     8.447913e+02                     8.522890e-02 
##                        PURCHASES                 ONEOFF_PURCHASES 
##                     3.143626e+02                     2.204483e+02 
##           INSTALLMENTS_PURCHASES                     CASH_ADVANCE 
##                     2.162904e+02                     5.500022e+02 
##              PURCHASES_FREQUENCY       ONEOFF_PURCHASES_FREQUENCY 
##                     3.619499e-01                     1.111671e-01 
## PURCHASES_INSTALLMENTS_FREQUENCY           CASH_ADVANCE_FREQUENCY 
##                     3.655734e-01                     1.127375e-01 
##                 CASH_ADVANCE_TRX                    PURCHASES_TRX 
##                     1.924920e+00                     6.830659e+00 
##                     CREDIT_LIMIT                         PAYMENTS 
##                     1.707943e+03                     3.766651e+02 
##                 MINIMUM_PAYMENTS                 PRC_FULL_PAYMENT 
##                     2.615090e+02                     5.107206e-02 
##                           TENURE 
##                     0.000000e+00
# Variance of each numeric column.
# drop = FALSE keeps a data frame even when num_cols names a single column, so
# the statistic is always computed per column and never per element.
vapply(data[, num_cols, drop = FALSE], var, numeric(1), na.rm = TRUE)
##                          BALANCE                BALANCE_FREQUENCY 
##                     7.136723e+05                     7.263965e-03 
##                        PURCHASES                 ONEOFF_PURCHASES 
##                     9.882387e+04                     4.859745e+04 
##           INSTALLMENTS_PURCHASES                     CASH_ADVANCE 
##                     4.678154e+04                     3.025024e+05 
##              PURCHASES_FREQUENCY       ONEOFF_PURCHASES_FREQUENCY 
##                     1.310078e-01                     1.235812e-02 
## PURCHASES_INSTALLMENTS_FREQUENCY           CASH_ADVANCE_FREQUENCY 
##                     1.336439e-01                     1.270974e-02 
##                 CASH_ADVANCE_TRX                    PURCHASES_TRX 
##                     3.705318e+00                     4.665790e+01 
##                     CREDIT_LIMIT                         PAYMENTS 
##                     2.917069e+06                     1.418766e+05 
##                 MINIMUM_PAYMENTS                 PRC_FULL_PAYMENT 
##                     6.838695e+04                     2.608356e-03 
##                           TENURE 
##                     0.000000e+00
# Minimum of each numeric column.
# drop = FALSE keeps a data frame even when num_cols names a single column, so
# the statistic is always computed per column and never per element.
vapply(data[, num_cols, drop = FALSE], min, numeric(1), na.rm = TRUE)
##                          BALANCE                BALANCE_FREQUENCY 
##                         1.591980                         0.636364 
##                        PURCHASES                 ONEOFF_PURCHASES 
##                         0.000000                         0.000000 
##           INSTALLMENTS_PURCHASES                     CASH_ADVANCE 
##                         0.000000                         0.000000 
##              PURCHASES_FREQUENCY       ONEOFF_PURCHASES_FREQUENCY 
##                         0.000000                         0.000000 
## PURCHASES_INSTALLMENTS_FREQUENCY           CASH_ADVANCE_FREQUENCY 
##                         0.000000                         0.000000 
##                 CASH_ADVANCE_TRX                    PURCHASES_TRX 
##                         0.000000                         0.000000 
##                     CREDIT_LIMIT                         PAYMENTS 
##                       150.000000                         4.841543 
##                 MINIMUM_PAYMENTS                 PRC_FULL_PAYMENT 
##                         2.891346                         0.000000 
##                           TENURE 
##                        12.000000
# Maximum of each numeric column.
# drop = FALSE keeps a data frame even when num_cols names a single column, so
# the statistic is always computed per column and never per element.
vapply(data[, num_cols, drop = FALSE], max, numeric(1), na.rm = TRUE)
##                          BALANCE                BALANCE_FREQUENCY 
##                      4393.939007                         1.000000 
##                        PURCHASES                 ONEOFF_PURCHASES 
##                      1957.300000                      1085.630000 
##           INSTALLMENTS_PURCHASES                     CASH_ADVANCE 
##                      1043.180000                      2780.106659 
##              PURCHASES_FREQUENCY       ONEOFF_PURCHASES_FREQUENCY 
##                         1.000000                         0.416667 
## PURCHASES_INSTALLMENTS_FREQUENCY           CASH_ADVANCE_FREQUENCY 
##                         1.000000                         0.416667 
##                 CASH_ADVANCE_TRX                    PURCHASES_TRX 
##                         7.000000                        30.000000 
##                     CREDIT_LIMIT                         PAYMENTS 
##                      7600.000000                      1741.996361 
##                 MINIMUM_PAYMENTS                 PRC_FULL_PAYMENT 
##                      1167.316335                         0.250000 
##                           TENURE 
##                        12.000000
# Quartiles (25%, 50%, 75%) of each numeric column, one column per variable.
# drop = FALSE keeps a data frame even when num_cols names a single column;
# numeric(3) pins the result to exactly three quantiles per variable.
vapply(
  data[, num_cols, drop = FALSE],
  function(x) quantile(x, probs = c(0.25, 0.5, 0.75), na.rm = TRUE),
  numeric(3)
)
##       BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES
## 25%  274.9226                 1      0.00             0.00
## 50%  955.1848                 1    149.71             0.00
## 75% 1472.5915                 1    420.96           139.85
##     INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY
## 25%                  0.000      0.00000               0.000
## 50%                  0.000     37.48372               0.250
## 75%                238.085    436.31512               0.625
##     ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY
## 25%                   0.000000                              0.0
## 50%                   0.000000                              0.0
## 75%                   0.083333                              0.5
##     CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS
## 25%               0.000000                0             0         1200 302.1927
## 50%               0.083333                1             3         2000 504.5612
## 75%               0.166667                3            10         3000 762.9291
##     MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
## 25%         187.6265                0     12
## 50%         311.9634                0     12
## 75%         542.5740                0     12
# Percentiles (here the 10th and 90th) of each numeric column.
# drop = FALSE keeps a data frame even when num_cols names a single column;
# numeric(2) pins the result to exactly two quantiles per variable.
vapply(
  data[, num_cols, drop = FALSE],
  function(x) quantile(x, probs = c(0.1, 0.9), na.rm = TRUE),
  numeric(2)
)
##        BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES
## 10%   58.89497          0.818182     0.000            0.000
## 90% 2218.14108          1.000000   702.028          415.236
##     INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY
## 10%                   0.00        0.000                   0
## 90%                 472.84     1153.266                   1
##     ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY
## 10%                       0.00                                0
## 90%                       0.25                                1
##     CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT
## 10%                   0.00                0             0         1000
## 90%                   0.25                5            15         5000
##      PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
## 10%  187.2024         139.8612         0.000000     12
## 90% 1143.3087         800.7793         0.090909     12
# Bar chart of the mean of every numeric column with ± one standard deviation
# error bars. Skipped (with a message) when num_cols is empty.
if (length(num_cols) > 0) {
  # Per-column mean and standard deviation.
  # drop = FALSE keeps a data frame even when num_cols names a single column,
  # so each statistic is computed per column and never per element.
  stats <- data.frame(
    variable = num_cols,
    mean = vapply(data[, num_cols, drop = FALSE], mean, numeric(1), na.rm = TRUE),
    sd = vapply(data[, num_cols, drop = FALSE], sd, numeric(1), na.rm = TRUE),
    stringsAsFactors = FALSE
  )

  # Bars ordered by increasing mean, error bars at mean ± sd
  print(ggplot(stats, aes(x = reorder(variable, mean), y = mean)) +
    geom_bar(stat = "identity", fill = "skyblue") +
    geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = 0.4) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "Moyenne ± Écart-type des variables", y = "Valeur", x = "Variable"))
} else {
  print("Erreur: Aucune colonne trouvée dans les données")
}

# One 30-bin histogram per numeric column, printed as it is built.
for (col in num_cols) {
  p <- ggplot(data, aes(x = .data[[col]]))
  p <- p + geom_histogram(bins = 30, color = "black", fill = "lightgreen")
  p <- p + labs(y = "Fréquence", x = col, title = paste("Distribution de", col))
  print(p)
}

# One boxplot per numeric column; the x axis carries no information for a
# single box, so its text and ticks are hidden.
for (col in num_cols) {
  p <- ggplot(data, aes(y = .data[[col]]))
  p <- p + geom_boxplot(color = "black", fill = "pink")
  p <- p + labs(y = col, title = paste("Boxplot de", col))
  p <- p + theme_minimal()
  p <- p + theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )
  print(p)
}

Outliers (IQR) – BALANCE

# IQR-based outlier detection for BALANCE.
# unname(): quantile() labels its result ("25%" / "75%"); stripping the label
# keeps it from leaking into the row name of the summary table below.
Q1 <- unname(quantile(data$BALANCE, 0.25, na.rm = TRUE))
Q3 <- unname(quantile(data$BALANCE, 0.75, na.rm = TRUE))
# Stored as iqr_balance rather than IQR so the script does not shadow
# stats::IQR().
iqr_balance <- Q3 - Q1
lower <- Q1 - 1.5 * iqr_balance
upper <- Q3 + 1.5 * iqr_balance

# Rows whose BALANCE lies outside the fences (subset() drops NA comparisons)
outliers_balance <- subset(data, BALANCE < lower | BALANCE > upper)

# One-row recap: fences and number of flagged rows
summary_outliers <- data.frame(
  variable = "BALANCE",
  lower = lower,
  upper = upper,
  n_outliers = nrow(outliers_balance)
)
print(summary_outliers)
##   variable     lower    upper n_outliers
## 1  BALANCE -1521.581 3269.095         32
# Ten largest BALANCE outliers, restricted to the key columns that exist
if (nrow(outliers_balance) == 0) {
  print("Aucun outlier détecté pour BALANCE selon la règle IQR.")
} else {
  key_cols <- intersect(
    c("BALANCE", "CREDIT_LIMIT", "PAYMENTS", "PURCHASES", "CASH_ADVANCE", "PRC_FULL_PAYMENT"),
    names(outliers_balance)
  )
  outliers_view <- outliers_balance[
    order(outliers_balance$BALANCE, decreasing = TRUE),
    key_cols,
    drop = FALSE
  ]
  print(utils::head(outliers_view, 10))
}
##       BALANCE CREDIT_LIMIT PAYMENTS PURCHASES CASH_ADVANCE PRC_FULL_PAYMENT
## 618  4393.939         5300 1026.406    417.08    2126.9616                0
## 1302 4383.572         5000 1161.701      0.00    1805.8535                0
## 1049 4355.982         7500 1188.603      0.00     305.9856                0
## 25   4245.855         5000 1004.355   1957.30       0.0000                0
## 933  4187.551         4500 1216.465      0.00     433.7192                0
## 1070 4128.312         5000 1082.274      0.00    1674.4016                0
## 1665 4071.994         5000 1528.928      0.00    1541.3938                0
## 19   4037.306         4500 1073.845     45.00     104.2381                0
## 972  4035.138         7000 1474.388      0.00    2399.4370                0
## 767  3985.638         6000 1203.181     20.00    1306.1840                0

Visualisation Graphique des Outliers

# 1. BALANCE boxplot with the outlying points drawn in red
ggplot(data, aes(y = BALANCE)) +
  geom_boxplot(outlier.size = 2, outlier.colour = "red", fill = "lightblue") +
  theme_minimal() +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  labs(y = "Balance", title = "Outliers de BALANCE (en rouge)")

# 2. Side-by-side boxplots of all numeric columns on a common z-score scale
if (length(num_cols) > 0) {
  # scale() divides by the standard deviation, so a constant column (sd = 0,
  # e.g. TENURE here) becomes NaN and all of its rows are dropped by
  # stat_boxplot() with a warning. Such columns have no spread to show and are
  # left out of the comparison.
  col_sd <- vapply(data[, num_cols, drop = FALSE], sd, numeric(1), na.rm = TRUE)
  plot_cols <- num_cols[!is.na(col_sd) & col_sd > 0]

  if (length(plot_cols) > 0) {
    # Standardise so variables with different units share one axis
    data_norm <- as.data.frame(scale(data[, plot_cols, drop = FALSE]))
    data_norm$ID <- seq_len(nrow(data_norm))

    # Long format: one row per (observation, variable) pair, as ggplot expects
    data_long <- melt(data_norm, id.vars = "ID", variable.name = "Variable", value.name = "Value")

    ggplot(data_long, aes(x = Variable, y = Value)) +
      geom_boxplot(fill = "lightgreen", outlier.colour = "red", outlier.size = 1.5) +
      labs(title = "Outliers de toutes les variables (données normalisées)",
           x = "Variables",
           y = "Valeur normalisée") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  }
}

# 3. BALANCE against CREDIT_LIMIT, points coloured by BALANCE outlier status
if (is.element("CREDIT_LIMIT", names(data))) {
  data$is_outlier_balance <- ifelse(
    data$BALANCE < lower | data$BALANCE > upper,
    "Outlier",
    "Normal"
  )

  ggplot(data, aes(x = CREDIT_LIMIT, y = BALANCE, color = is_outlier_balance)) +
    geom_point(size = 2, alpha = 0.5) +
    scale_color_manual(values = c("Normal" = "blue", "Outlier" = "red")) +
    theme_minimal() +
    labs(
      color = "Type",
      y = "Balance",
      x = "Credit Limit",
      title = "Outliers de BALANCE en fonction de CREDIT_LIMIT"
    )
}

# 4. BALANCE density with the two IQR fences marked as dashed red lines and
#    labelled at the baseline
ggplot(data, aes(x = BALANCE)) +
  geom_density(alpha = 0.5, fill = "lightblue") +
  geom_vline(xintercept = lower, linewidth = 1, linetype = "dashed", color = "red") +
  geom_vline(xintercept = upper, linewidth = 1, linetype = "dashed", color = "red") +
  annotate(
    "text",
    x = lower, y = 0,
    label = paste("Q1-1.5*IQR:", round(lower, 2)),
    hjust = 1.1, color = "red"
  ) +
  annotate(
    "text",
    x = upper, y = 0,
    label = paste("Q3+1.5*IQR:", round(upper, 2)),
    hjust = -0.1, color = "red"
  ) +
  theme_minimal() +
  labs(
    y = "Densité",
    x = "Balance",
    title = "Distribution de BALANCE avec limites d'outliers"
  )

# 5. IQR-based outlier counts for every numeric column.
# Each column produces one summary row; the rows are collected in a
# pre-allocated list and bound once, instead of growing the data frame with
# rbind() on every iteration.
outlier_rows <- vector("list", length(num_cols))

for (i in seq_along(num_cols)) {
  col <- num_cols[i]
  # unname(): quantile() labels its result ("25%"), which would otherwise
  # become the row name of every summary row ("25%", "25%1", ...).
  Q1 <- unname(quantile(data[[col]], 0.25, na.rm = TRUE))
  Q3 <- unname(quantile(data[[col]], 0.75, na.rm = TRUE))
  IQR_val <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR_val
  upper_bound <- Q3 + 1.5 * IQR_val

  # Missing values are not counted as outliers; the percentage is relative to
  # all rows of data.
  n_outliers <- sum(data[[col]] < lower_bound | data[[col]] > upper_bound, na.rm = TRUE)
  pct_outliers <- round(100 * n_outliers / nrow(data), 2)

  outlier_rows[[i]] <- data.frame(
    Variable = col,
    Q1 = Q1,
    Q3 = Q3,
    IQR = IQR_val,
    Lower = lower_bound,
    Upper = upper_bound,
    N_Outliers = n_outliers,
    Pct_Outliers = pct_outliers
  )
}

# An empty num_cols yields an empty data frame
outliers_summary <- if (length(outlier_rows) > 0) {
  do.call(rbind, outlier_rows)
} else {
  data.frame()
}

# Show the recap table
print(outliers_summary)
##                            Variable        Q1          Q3         IQR
## 1                           BALANCE  274.9226 1472.591481 1197.668854
## 2                 BALANCE_FREQUENCY    1.0000    1.000000    0.000000
## 3                         PURCHASES    0.0000  420.960000  420.960000
## 4                  ONEOFF_PURCHASES    0.0000  139.850000  139.850000
## 5            INSTALLMENTS_PURCHASES    0.0000  238.085000  238.085000
## 6                      CASH_ADVANCE    0.0000  436.315122  436.315122
## 7               PURCHASES_FREQUENCY    0.0000    0.625000    0.625000
## 8        ONEOFF_PURCHASES_FREQUENCY    0.0000    0.083333    0.083333
## 9  PURCHASES_INSTALLMENTS_FREQUENCY    0.0000    0.500000    0.500000
## 10           CASH_ADVANCE_FREQUENCY    0.0000    0.166667    0.166667
## 11                 CASH_ADVANCE_TRX    0.0000    3.000000    3.000000
## 12                    PURCHASES_TRX    0.0000   10.000000   10.000000
## 13                     CREDIT_LIMIT 1200.0000 3000.000000 1800.000000
## 14                         PAYMENTS  302.1927  762.929061  460.736406
## 15                 MINIMUM_PAYMENTS  187.6265  542.573998  354.947543
## 16                 PRC_FULL_PAYMENT    0.0000    0.000000    0.000000
## 17                           TENURE   12.0000   12.000000    0.000000
##            Lower        Upper N_Outliers Pct_Outliers
## 1  -1521.5806553 3269.0947627         32         1.57
## 2      1.0000000    1.0000000        278        13.66
## 3   -631.4400000 1052.4000000         44         2.16
## 4   -209.7750000  349.6250000        245        12.04
## 5   -357.1275000  595.2125000        112         5.50
## 6   -654.4726837 1090.7878062        233        11.45
## 7     -0.9375000    1.5625000          0         0.00
## 8     -0.1249995    0.2083325        277        13.61
## 9     -0.7500000    1.2500000          0         0.00
## 10    -0.2500005    0.4166675          0         0.00
## 11    -4.5000000    7.5000000          0         0.00
## 12   -15.0000000   25.0000000         37         1.82
## 13 -1500.0000000 5700.0000000        175         8.60
## 14  -388.9119540 1454.0336700         85         4.18
## 15  -344.7948600 1074.9953120         36         1.77
## 16     0.0000000    0.0000000        313        15.38
## 17    12.0000000   12.0000000          0         0.00
# Bar chart of the outlier percentage per variable, highest first, with the
# percentage printed above each bar
ggplot(outliers_summary, aes(x = reorder(Variable, -Pct_Outliers), y = Pct_Outliers)) +
  geom_col(fill = "coral") +
  geom_text(aes(label = paste0(Pct_Outliers, "%")), size = 3, vjust = -0.5) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  labs(
    y = "% d'outliers",
    x = "Variable",
    title = "Pourcentage d'outliers par variable"
  )

## Détection détaillée des outliers pour chaque variable

# For every numeric column: print the IQR outlier statistics, then draw a
# boxplot, a density plot with the fences, and a by-index scatter plot, and
# finally list the most extreme outliers.
for (col in num_cols) {
  cat("\n### Variable:", col, "\n\n")

  # IQR fences: 1.5 * IQR beyond the first and third quartiles
  Q1 <- quantile(data[[col]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[col]], 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR_val
  upper_bound <- Q3 + 1.5 * IQR_val

  # Flag the outliers. The mask is NA where the value itself is missing, so
  # rows are later selected through which(), which ignores NA; indexing with
  # the raw mask would return one all-NA row per missing value.
  outliers_mask <- data[[col]] < lower_bound | data[[col]] > upper_bound
  outlier_rows <- which(outliers_mask)
  n_outliers <- sum(outliers_mask, na.rm = TRUE)
  pct_outliers <- round(100 * n_outliers / nrow(data), 2)

  # Print the statistics
  cat("- Nombre d'outliers:", n_outliers, "/", nrow(data), "(", pct_outliers, "%)\n")
  cat("- Limites: [", round(lower_bound, 2), ",", round(upper_bound, 2), "]\n")
  cat("- Q1 =", round(Q1, 2), ", Q3 =", round(Q3, 2), ", IQR =", round(IQR_val, 2), "\n\n")

  # Status column used to colour the scatter plot (overwritten on every pass)
  data$outlier_status <- ifelse(outliers_mask, "Outlier", "Normal")

  # 1. Individual boxplot with outliers in red
  p1 <- ggplot(data, aes(y = .data[[col]])) +
    geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 2) +
    labs(title = paste("Boxplot de", col),
         y = col) +
    theme_minimal() +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank())
  print(p1)

  # 2. Density plot: fences dashed red, quartiles dotted blue
  p2 <- ggplot(data, aes(x = .data[[col]])) +
    geom_density(fill = "lightgreen", alpha = 0.5) +
    geom_vline(xintercept = lower_bound, color = "red", linetype = "dashed", linewidth = 1) +
    geom_vline(xintercept = upper_bound, color = "red", linetype = "dashed", linewidth = 1) +
    geom_vline(xintercept = Q1, color = "blue", linetype = "dotted", linewidth = 0.8) +
    geom_vline(xintercept = Q3, color = "blue", linetype = "dotted", linewidth = 0.8) +
    labs(title = paste("Distribution de", col, "avec limites d'outliers"),
         x = col,
         y = "Densité") +
    theme_minimal()
  print(p2)

  # 3. Value against row index, to see where the outliers sit in the data.
  #    seq_len() stays empty for a zero-row data frame, unlike 1:nrow(data).
  p3 <- ggplot(data, aes(x = seq_len(nrow(data)), y = .data[[col]], color = outlier_status)) +
    geom_point(alpha = 0.6, size = 1.5) +
    scale_color_manual(values = c("Normal" = "blue", "Outlier" = "red")) +
    labs(title = paste("Outliers de", col, "par observation"),
         x = "Index",
         y = col,
         color = "Type") +
    theme_minimal()
  print(p3)

  # List the outliers: all of them when there are at most 20, otherwise the
  # ten values farthest from the median. && is used because the condition is
  # scalar.
  if (n_outliers > 0 && n_outliers <= 20) {
    cat("\nTous les outliers détectés:\n")
    outliers_data <- data[outlier_rows, col, drop = FALSE]
    print(outliers_data[order(outliers_data[[col]], decreasing = TRUE), , drop = FALSE])
  } else if (n_outliers > 20) {
    cat("\nTop 10 outliers (valeurs les plus extrêmes):\n")
    outliers_data <- data[outlier_rows, col, drop = FALSE]
    distance_to_median <- abs(outliers_data[[col]] - median(data[[col]], na.rm = TRUE))
    print(utils::head(outliers_data[order(distance_to_median, decreasing = TRUE), , drop = FALSE], 10))
  }

  cat("\n---\n")
}
## 
## ### Variable: BALANCE 
## 
## - Nombre d'outliers: 32 / 2035 ( 1.57 %)
## - Limites: [ -1521.58 , 3269.09 ]
## - Q1 = 274.92 , Q3 = 1472.59 , IQR = 1197.67

## 
## Top 10 outliers (valeurs les plus extrêmes):
##       BALANCE
## 618  4393.939
## 1302 4383.572
## 1049 4355.982
## 25   4245.855
## 933  4187.551
## 1070 4128.312
## 1665 4071.994
## 19   4037.306
## 972  4035.138
## 767  3985.638
## 
## ---
## 
## ### Variable: BALANCE_FREQUENCY 
## 
## - Nombre d'outliers: 278 / 2035 ( 13.66 %)
## - Limites: [ 1 , 1 ]
## - Q1 = 1 , Q3 = 1 , IQR = 0

## 
## Top 10 outliers (valeurs les plus extrêmes):
##     BALANCE_FREQUENCY
## 26           0.636364
## 94           0.636364
## 186          0.636364
## 221          0.636364
## 238          0.636364
## 311          0.636364
## 340          0.636364
## 376          0.636364
## 397          0.636364
## 398          0.636364
## 
## ---
## 
## ### Variable: PURCHASES 
## 
## - Nombre d'outliers: 44 / 2035 ( 2.16 %)
## - Limites: [ -631.44 , 1052.4 ]
## - Q1 = 0 , Q3 = 420.96 , IQR = 420.96

## 
## Top 10 outliers (valeurs les plus extrêmes):
##      PURCHASES
## 25     1957.30
## 1383   1853.70
## 1636   1814.65
## 288    1788.10
## 595    1725.05
## 444    1672.99
## 170    1594.00
## 160    1555.66
## 540    1518.26
## 124    1488.80
## 
## ---
## 
## ### Variable: ONEOFF_PURCHASES 
## 
## - Nombre d'outliers: 245 / 2035 ( 12.04 %)
## - Limites: [ -209.77 , 349.62 ]
## - Q1 = 0 , Q3 = 139.85 , IQR = 139.85

## 
## Top 10 outliers (valeurs les plus extrêmes):
##      ONEOFF_PURCHASES
## 239           1085.63
## 170           1084.00
## 1094          1081.00
## 25            1077.30
## 640           1050.76
## 906           1045.84
## 365           1040.99
## 1046          1029.29
## 31            1025.00
## 540           1024.83
## 
## ---
## 
## ### Variable: INSTALLMENTS_PURCHASES 
## 
## - Nombre d'outliers: 112 / 2035 ( 5.5 %)
## - Limites: [ -357.13 , 595.21 ]
## - Q1 = 0 , Q3 = 238.08 , IQR = 238.08

## 
## Top 10 outliers (valeurs les plus extrêmes):
##      INSTALLMENTS_PURCHASES
## 537                 1043.18
## 199                 1043.00
## 1509                1039.92
## 595                 1025.00
## 543                 1020.76
## 181                 1016.00
## 673                 1013.00
## 641                  996.66
## 235                  972.38
## 1283                 968.64
## 
## ---
## 
## ### Variable: CASH_ADVANCE 
## 
## - Nombre d'outliers: 233 / 2035 ( 11.45 %)
## - Limites: [ -654.47 , 1090.79 ]
## - Q1 = 0 , Q3 = 436.32 , IQR = 436.32

## 
## Top 10 outliers (valeurs les plus extrêmes):
##      CASH_ADVANCE
## 814      2780.107
## 1042     2774.386
## 1423     2721.304
## 1085     2706.494
## 1598     2704.303
## 2031     2625.645
## 870      2617.931
## 1105     2601.135
## 609      2577.953
## 1080     2548.837
## 
## ---
## 
## ### Variable: PURCHASES_FREQUENCY 
## 
## - Nombre d'outliers: 0 / 2035 ( 0 %)
## - Limites: [ -0.94 , 1.56 ]
## - Q1 = 0 , Q3 = 0.62 , IQR = 0.62

## 
## ---
## 
## ### Variable: ONEOFF_PURCHASES_FREQUENCY 
## 
## - Nombre d'outliers: 277 / 2035 ( 13.61 %)
## - Limites: [ -0.12 , 0.21 ]
## - Q1 = 0 , Q3 = 0.08 , IQR = 0.08

## 
## Top 10 outliers (valeurs les plus extrêmes):
##     ONEOFF_PURCHASES_FREQUENCY
## 59                    0.416667
## 73                    0.416667
## 93                    0.416667
## 102                   0.416667
## 237                   0.416667
## 245                   0.416667
## 255                   0.416667
## 261                   0.416667
## 340                   0.416667
## 357                   0.416667
## 
## ---
## 
## ### Variable: PURCHASES_INSTALLMENTS_FREQUENCY 
## 
## - Nombre d'outliers: 0 / 2035 ( 0 %)
## - Limites: [ -0.75 , 1.25 ]
## - Q1 = 0 , Q3 = 0.5 , IQR = 0.5

## 
## ---
## 
## ### Variable: CASH_ADVANCE_FREQUENCY 
## 
## - Nombre d'outliers: 0 / 2035 ( 0 %)
## - Limites: [ -0.25 , 0.42 ]
## - Q1 = 0 , Q3 = 0.17 , IQR = 0.17

## 
## ---
## 
## ### Variable: CASH_ADVANCE_TRX 
## 
## - Nombre d'outliers: 0 / 2035 ( 0 %)
## - Limites: [ -4.5 , 7.5 ]
## - Q1 = 0 , Q3 = 3 , IQR = 3

## 
## ---
## 
## ### Variable: PURCHASES_TRX 
## 
## - Nombre d'outliers: 37 / 2035 ( 1.82 %)
## - Limites: [ -15 , 25 ]
## - Q1 = 0 , Q3 = 10 , IQR = 10

## 
## Top 10 outliers (valeurs les plus extrêmes):
##      PURCHASES_TRX
## 38              30
## 299             30
## 1004            30
## 1391            30
## 1729            30
## 594             29
## 910             29
## 1710            29
## 1816            29
## 322             28
## 
## ---
## 
## ### Variable: CREDIT_LIMIT 
## 
## - Nombre d'outliers: 175 / 2035 ( 8.6 %)
## - Limites: [ -1500 , 5700 ]
## - Q1 = 1200 , Q3 = 3000 , IQR = 1800

## 
## Top 10 outliers (valeurs les plus extrêmes):
##     CREDIT_LIMIT
## 693         7600
## 30          7500
## 31          7500
## 49          7500
## 90          7500
## 133         7500
## 178         7500
## 181         7500
## 322         7500
## 324         7500
## 
## ---
## 
## ### Variable: PAYMENTS 
## 
## - Nombre d'outliers: 85 / 2035 ( 4.18 %)
## - Limites: [ -388.91 , 1454.03 ]
## - Q1 = 302.19 , Q3 = 762.93 , IQR = 460.74

## 
## Top 10 outliers (valeurs les plus extrêmes):
##      PAYMENTS
## 1377 1741.996
## 259  1738.450
## 176  1736.121
## 768  1734.837
## 1080 1732.275
## 1695 1723.351
## 260  1721.926
## 1415 1713.312
## 1127 1713.300
## 1445 1711.865
## 
## ---
## 
## ### Variable: MINIMUM_PAYMENTS 
## 
## - Nombre d'outliers: 36 / 2035 ( 1.77 %)
## - Limites: [ -344.79 , 1075 ]
## - Q1 = 187.63 , Q3 = 542.57 , IQR = 354.95

## 
## Top 10 outliers (valeurs les plus extrêmes):
##      MINIMUM_PAYMENTS
## 735          1167.316
## 1741         1164.898
## 1940         1159.793
## 347          1159.311
## 1665         1157.359
## 683          1155.325
## 618          1152.616
## 767          1148.849
## 936          1147.377
## 605          1145.811
## 
## ---
## 
## ### Variable: PRC_FULL_PAYMENT 
## 
## - Nombre d'outliers: 313 / 2035 ( 15.38 %)
## - Limites: [ 0 , 0 ]
## - Q1 = 0 , Q3 = 0 , IQR = 0

## 
## Top 10 outliers (valeurs les plus extrêmes):
##     PRC_FULL_PAYMENT
## 48              0.25
## 94              0.25
## 150             0.25
## 188             0.25
## 221             0.25
## 251             0.25
## 303             0.25
## 427             0.25
## 444             0.25
## 459             0.25
## 
## ---
## 
## ### Variable: TENURE 
## 
## - Nombre d'outliers: 0 / 2035 ( 0 %)
## - Limites: [ 12 , 12 ]
## - Q1 = 12 , Q3 = 12 , IQR = 0

## 
## ---

Matrice de corrélation des outliers

# Indicator table: one row per observation, one column per numeric variable,
# holding 1 where the value lies outside that variable's IQR fences, else 0.
outliers_matrix <- data.frame(matrix(0, ncol = length(num_cols), nrow = nrow(data)))
colnames(outliers_matrix) <- num_cols

for (i in seq_along(num_cols)) {
  col <- num_cols[i]
  Q1 <- quantile(data[[col]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[col]], 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR_val
  upper_bound <- Q3 + 1.5 * IQR_val

  outliers_matrix[, i] <- ifelse(
    data[[col]] < lower_bound | data[[col]] > upper_bound,
    1,
    0
  )
}

# Keep only indicator columns with non-zero variance, i.e. variables with at
# least one outlier and at least one normal value.
valid_cols <- sapply(outliers_matrix, function(x) {
  isTRUE(var(x, na.rm = TRUE) > 0)
})

# Announce the variables that have at least one detected outlier
cat("Variables avec outliers détectés:\n")
## Variables avec outliers détectés:
cat(toString(num_cols[valid_cols]), "\n\n")
## BALANCE, BALANCE_FREQUENCY, PURCHASES, ONEOFF_PURCHASES, INSTALLMENTS_PURCHASES, CASH_ADVANCE, ONEOFF_PURCHASES_FREQUENCY, PURCHASES_TRX, CREDIT_LIMIT, PAYMENTS, MINIMUM_PAYMENTS, PRC_FULL_PAYMENT
# Correlation between the outlier indicators of the variables: a heatmap, then
# the pairs of variables whose outliers tend to occur on the same observations.
if (sum(valid_cols) > 1) {
  # Correlation is only defined for indicator columns with non-zero variance
  cor_outliers <- cor(outliers_matrix[, valid_cols], use = "complete.obs")

  # Correlation heatmap
  cor_melted <- melt(cor_outliers)
  print(ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() +
    geom_text(aes(label = round(value, 2)), size = 3) +
    scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                         midpoint = 0, limit = c(-1, 1)) +
    labs(title = "Corrélation entre les outliers des variables",
         x = "", y = "", fill = "Corrélation") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)))

  # Report the strongly correlated pairs.
  # upper.tri() restricts the search to the cells above the diagonal: every
  # unordered pair is listed exactly once, the diagonal is skipped, and a
  # perfectly correlated pair (value 1) is still reported.
  cat("\nVariables avec forte corrélation d'outliers (> 0.5):\n")
  high_cor <- which(cor_outliers > 0.5 & upper.tri(cor_outliers), arr.ind = TRUE)
  if (nrow(high_cor) > 0) {
    for (i in seq_len(nrow(high_cor))) {
      var1 <- rownames(cor_outliers)[high_cor[i, 1]]
      var2 <- colnames(cor_outliers)[high_cor[i, 2]]
      cor_val <- cor_outliers[high_cor[i, 1], high_cor[i, 2]]
      cat(var1, "<->", var2, ":", round(cor_val, 3), "\n")
    }
  } else {
    cat("Aucune forte corrélation détectée.\n")
  }
} else {
  cat("Impossible de calculer la corrélation : pas assez de variables avec des outliers.\n")
  if (sum(valid_cols) == 0) {
    cat("Aucune variable n'a d'outliers détectés.\n")
  } else {
    cat("Une seule variable a des outliers :", num_cols[valid_cols], "\n")
  }
}

## 
## Variables avec forte corrélation d'outliers (> 0.5):
## ONEOFF_PURCHASES <-> ONEOFF_PURCHASES_FREQUENCY : 0.527